# import packages
library(tidyverse)
library(visNetwork)
library(plotly)
library(reshape2)
library(scales)
# import data (the second sheet from the Excel file, saved it as a txt)
d = read.table("PublicTransport.txt", header=T, na.strings = "<Null>", check.names=FALSE)
dlong = melt(d[,-(1:2)], variable.name = "date") # convert to long format
# a quick look at the data - some pretty hefty outliers in there
# if 20-Apr means "April of 2020" then might be worth recoding as 2020-04 for easier ordering and such. Also check out the lubridate package.
ggplot(dlong, aes(y=value,x=date)) +
geom_boxplot(outlier.colour = rgb(0,0,0,0.4)) +
theme_minimal()
# using log scale for color, otherwise the outliers make the rest of the scale hard to read
# will temporarily mutate 0s to NAs to allow that; showing those as black
heatm = ggplot(dlong %>% mutate(value=na_if(value,0)),
aes(y=date, x=od_path, fill=value))+
geom_tile() +
scale_fill_viridis_c(trans = "log",
breaks=10^c(1:round(log10(max(dlong$value,na.rm=T)))),
label=comma, na.value = "black") +
theme_minimal() +
theme(axis.text.x = element_text(angle=90, size=4, hjust=1, vjust=0))
heatm
# interactive version (the values are on log scale because of the color scale transformation; could be trasnformed back to counts of course)
ggplotly(heatm + theme(legend.position = "none"))
So not much seasonal variation, a bit in Jul-Aug (darker bands), but a noticable corona-drop around April. Some routes are used 0 times in some months.
route="O8D7"
means = dlong %>% group_by(od_path) %>% summarise(m = mean(value, na.rm=T))
dlong$means = means$m[match(dlong$od_path, means$od_path)]
hlcol = viridis_pal(option="E")(10)[2]
ggplot(dlong, aes(x=date, y=value,color=means, group=od_path))+
geom_line( size=0.7, alpha=0.6) +
#scale_color_gradient(low="gray90",high = "gray40", limits=c(0,25000))+
scale_color_viridis_c(option="D", begin=0.3, end=1, limits=c(0,20000)) +
geom_line(data=dlong %>% subset(od_path==route), color="black", size=2) +
coord_cartesian(ylim=c(0,25000), xlim=c(1,14) ) + # leaving out the couple outliers
geom_text(aes(x=13.1, label=od_path),
subset(dlong, date=="20-Jul" & od_path==route),
size=5, hjust=0, color="black") +
theme_minimal() + theme(legend.position = "none")
Could be put on a map too, provided long/lat. Could show all routes but it’s a mess of a hairball; here just showing top popular routes in April (could also be aggregate top general routes etc). Click on nodes to highlight connections.
xdate = "20-Apr"
top20 = order(d[,xdate], na.last = F) %>% tail(20)
edges = d[top20,] %>%
mutate_at(c("Origin", "Destination"), as.character ) %>%
rename(from=Origin, to=Destination)
edges$value = edges[, xdate]
edges$color = viridis_pal(end = 0.8)(max( edges[, xdate],na.rm=T))[ edges[, xdate] ]
nodes = as.character(sort(union(edges$from, edges$to))) %>% data.frame(id=., label=.)
visNetwork(nodes=nodes, edges=edges, height = "500px", width = "100%") %>%
visNodes(size = 5, width=1, shadow=T, font=list(size = 20)) %>%
visEdges(arrows = list(to = list(enabled = TRUE, scaleFactor =0.5)), shadow=T, smooth=list(type="discrete"), selectionWidth=1.5) %>%
visOptions(highlightNearest = list(enabled = T, hover = T, degree=1, labelOnly=F, algorithm="hierarchical"), nodesIdSelection = T) %>%
visIgraphLayout("layout_in_circle", smooth=T)